In [26]:
import pandas as pd
import pickle

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
import seaborn as sns
import scipy
import math

# You can configure the format of the images: ‘png’, ‘retina’, ‘jpeg’, ‘svg’, ‘pdf’.
%config InlineBackend.figure_format = 'svg'
# this statement allows the visuals to render within your Jupyter Notebook
%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 3)

import psycopg2 as pg
import pandas.io.sql as pd_sql
import warnings
warnings.filterwarnings('ignore')
In [101]:
connection_args = {
    'host': 'localhost',  
    'dbname': 'cps',    
    'port': 5432
}

conn = pg.connect(**connection_args)

Target Variable (graduation rate) exploration

Let's look at the graduation rates of high schools in the 2016/17 academic school year.

2016/17

In [4]:
hs_records_1617 = ('''SELECT COUNT(school_id) FROM allschools_1617
                             WHERE is_high_school = 'Y'
                   ''')

cursor = conn.cursor()
cursor.execute(hs_records_1617)
print("There are %s high schools in the 2016/17 dataset" %cursor.fetchall()[0][0])

hs_gr_null_1617   = ('''SELECT COUNT(school_id) FROM allschools_1617
                             WHERE is_high_school = 'Y'
                             AND graduation_rate_school IS NULL
                     ''')

cursor = conn.cursor()
cursor.execute(hs_gr_null_1617)
print("There are %s high schools records in the 2016/17 dataset with null graduation rates" 
                  %cursor.fetchall()[0][0])

hs_gr_notnull_1617 = ('''SELECT COUNT(school_id) FROM allschools_1617
                             WHERE is_high_school = 'Y'
                             AND graduation_rate_school IS NOT NULL
                      ''')

cursor = conn.cursor()
cursor.execute(hs_gr_notnull_1617)

print("There are %s high schools records in the 2016/17 dataset with non-null graduation rates" %cursor.fetchall()[0][0])
There are 184 high schools in the 2016/17 dataset
There are 63 high schools records in the 2016/17 dataset with null graduation rates
There are 121 high schools records in the 2016/17 dataset with non-null graduation rates

2017/18

Now let's look at 2017/18

In [3]:
hs_records_1718 = ('''SELECT COUNT(school_id) FROM allschools_1718
                             WHERE is_high_school = 'Y'
                   ''')

cursor = conn.cursor()
cursor.execute(hs_records_1718)
print("There are %s high schools in the 2017/18 dataset" %cursor.fetchall()[0][0])

hs_gr_null_1718   = ('''SELECT COUNT(school_id) FROM allschools_1718
                             WHERE is_high_school = 'Y'
                             AND graduation_rate_school IS NULL
                     ''')

cursor = conn.cursor()
cursor.execute(hs_gr_null_1718)
print("There are %s high schools records in the 2017/18 dataset with null graduation rates" 
                  %cursor.fetchall()[0][0])

hs_gr_notnull_1718 = ('''SELECT COUNT(school_id) FROM allschools_1718
                             WHERE is_high_school = 'Y'
                             AND graduation_rate_school IS NOT NULL
                      ''')

cursor = conn.cursor()
cursor.execute(hs_gr_notnull_1718)

print("There are %s high schools records in the 2017/18 dataset with non-null graduation rates" %cursor.fetchall()[0][0])
There are 184 high schools in the 2017/18 dataset
There are 60 high schools records in the 2017/18 dataset with null graduation rates
There are 124 high schools records in the 2017/18 dataset with non-null graduation rates

Next, let's look at the variance of graduation rates across both years.

In [71]:
# first, create lists of the graduation rates across both years.
hs_gr_notnull_1617 = ('''SELECT graduation_rate_school FROM allschools_1617
                             WHERE is_high_school = 'Y'
                             AND graduation_rate_school IS NOT NULL
                      ''')

cursor = conn.cursor()
cursor.execute(hs_gr_notnull_1617)

grad_rates_1617 = [rate[0] for rate in cursor.fetchall()]

var_1617 = gradrate_var(grad_rates_1617)

hs_gr_notnull_1718 = ('''SELECT graduation_rate_school FROM allschools_1718
                             WHERE is_high_school = 'Y'
                             AND graduation_rate_school IS NOT NULL
                      ''')

cursor = conn.cursor()
cursor.execute(hs_gr_notnull_1718)

grad_rates_1718 = [rate[0] for rate in cursor.fetchall()]
In [72]:
#Then, let's define a class that takes an input of a list of graduation 
#rates, and computes the mean, variance, and std. for variance.
class Gradrates:
    
    def __init__(self, rate_list):
        self.rate_list = rate_list
        self.mean = sum(self.rate_list)/len(rate_list)
    
    def variance(self):
        sum_of_squared_totals = 0
        for grad_rate in self.rate_list:
            sum_of_squared_totals += (grad_rate - self.mean)**2
        grad_rate_var = sum_of_squared_totals/len(self.rate_list)
        return grad_rate_var
    
    def std(self):
        stand_dev = math.sqrt(self.variance())
        return(stand_dev)
In [73]:
grad_1617 = Gradrates(grad_rates_1617)
grad_1718 = Gradrates(grad_rates_1718)
In [74]:
print("Mean graduation rate 2016/17: %s" %grad_1617.mean)
print("Mean graduation rate 2017/18: %s" %grad_1718.mean)
Mean graduation rate 2016/17: 72.89173553719007
Mean graduation rate 2017/18: 74.82822580645163
In [79]:
print("Variance 2016/17: %s" %grad_1617.variance())
print("Variance 2017/18: %s" %grad_1718.variance())
Variance 2016/17: 371.8863779796461
Variance 2017/18: 350.00492911030176

There is more variance in 17/18 graduation rates.

There are some outliers in the data. Let's look at the values which fall outside of two standard deviations.

In [95]:
outliers_1617 = [x for x in grad_rates_1617 if 
            (abs(x - grad_1617.mean) > (grad_1617.std() * 2))]
outliers_1718 = [x for x in grad_rates_1718 if 
            (abs(x - grad_1718.mean) > (grad_1718.std() * 2))]

print("Outiers 2016/17: %s" %outliers_1617)
print("Outiers 2017/18: %s" %outliers_1718)
Outiers 2016/17: [3.1, 0.0, 1.7, 1.3, 21.4, 27.8]
Outiers 2017/18: [0.0, 33.3, 20.6, 0.8, 2.3, 23.1]

Let's find those schools in our database.

In [109]:
cursor = conn.cursor()

outliers_1617_qry = """SELECT school_id, short_name, graduation_rate_school FROM allschools_1617
                    WHERE graduation_rate_school < 28"""
cursor.execute(outliers_1617_qry)
cursor.fetchall()
Out[109]:
[(609744, 'NORTHSIDE LEARNING HS', 3.1),
 (610557, 'PATHWAYS - ASHBURN HS', 0.0),
 (609748, 'YORK HS', 1.7),
 (609783, 'JEFFERSON HS', 1.3),
 (609750, 'SIMPSON HS', 21.4),
 (609766, 'VAUGHN HS', 27.8)]
In [108]:
cursor = conn.cursor()

outliers_1718_qry = """SELECT school_id, short_name, graduation_rate_school FROM allschools_1718
                    WHERE graduation_rate_school < 28"""
cursor.execute(outliers_1718_qry)
cursor.fetchall()
Out[108]:
[(609744, 'NORTHSIDE LEARNING HS', 0.0),
 (609766, 'VAUGHN HS', 20.6),
 (609783, 'JEFFERSON HS', 0.8),
 (609748, 'YORK HS', 2.3),
 (609750, 'SIMPSON HS', 23.1)]

With some quick google searches, one can intuit the reason for these low graduations rates of these schools. Northside Learning HS is a school for students with " significant intellectual disabilities and impaired adaptive functioning"; Vaughn provides "a specialized education for high school students with cognitive, developmental and multiple disabilities"; York is a high school run out of Cook County Jail; Jefferson teaches students at Cook County Juvenile Detention Center; Simpson educates women who are pregrant or teen mothers. Pathways is part of "a national network of non-profit public schools and learning programs that partner with local school districts to reengage at-risk youth." Noble causes all, but including the graduation rates in this study is unfair and would throw off the analysis.

In [110]:
cursor = conn.cursor()

pathways = """SELECT school_id, short_name, graduation_rate_school FROM allschools_1718
                    WHERE school_id = 610557"""
cursor.execute(pathways)
cursor.fetchall()
Out[110]:
[(610557, 'PATHWAYS - ASHBURN HS', None)]

Now for some target variable histograms

In [6]:
hs_gr_1617 = '''SELECT graduation_rate_school FROM allschools_1617
                                              WHERE is_high_school = 'Y'
                                              AND graduation_rate_school IS NOT NULL'''

cursor.execute(hs_gr_1617)
gr_1617 = [gr[0] for gr in cursor.fetchall()]
sns.distplot(gr_1617, color='b')
plt.xlim(0,100)
plt.xlabel("Percentage of Students Graduating")
plt.title('CPS Graduation Rates 16/17')
Out[6]:
Text(0.5, 1.0, 'CPS Graduation Rates 16/17')
In [7]:
hs_gr_1718 = '''SELECT graduation_rate_school FROM allschools_1718
                                              WHERE is_high_school = 'Y'
                                              AND graduation_rate_school IS NOT NULL'''

cursor.execute(hs_gr_1718)
gr_1718 = [gr[0] for gr in cursor.fetchall()]
sns.distplot(gr_1718, color='orange')
plt.xlim(0, 100)
plt.xlabel("Percentage of Students Graduating")
plt.title('CPS Graduation Rates 17/18')
Out[7]:
Text(0.5, 1.0, 'CPS Graduation Rates 17/18')

After superimposing the two histograms, we see increase of grad rates in 2017/18 in comparison to 2016/17

In [8]:
hist_plt = [gr_1617, gr_1718]
color = ['blue', 'orange']
year  = ['2016/17', '2017/18']
for a,c, y in zip(hist_plt, color, year):
    sns.distplot(a, color=c, label=y)
plt.xlim(0,100)
plt.xlabel("Percentage of Students Graduating")
plt.legend()
plt.title("HS Grad Rates 16/17 and 17/18")
Out[8]:
Text(0.5, 1.0, 'HS Grad Rates 16/17 and 17/18')
In [11]:
print("Skew for 2016/17 school year: %s" %scipy.stats.skew(gr_1617))
print("Skew for 2017/18 school year: %s" %scipy.stats.skew(gr_1718))
Skew for 2016/17 school year: -1.9389214792620162
Skew for 2017/18 school year: -1.94880818014248

Since the graduation rates are both heavily skewed left, they will need to be reflected along the x-axis and log-transformed before regression. See feature engineering notebook for the transformation code. The database has a table with model ready data, which we'll work with from here on out.

Log transformed Graduation Rates

In [12]:
query = ('''SELECT * FROM hs_modeldata WHERE log_grad_rate IS NOT NULL''');
df = pd_sql.read_sql(query, conn)

The skew is much reduced with the log-transformed graduation rates.

In [26]:
print("Skew for 2016/17 after log-transformation: %s" 
        %scipy.stats.skew(df[df.school_year=='School Year 2016-2017'].log_grad_rate))
print("Skew for 2017/18 after log-transformation: %s" 
        %scipy.stats.skew(df[df.school_year=='School Year 2017-2018'].log_grad_rate))
Skew for 2016/17 after log-transformation: -0.5025009349240924
Skew for 2017/18 after log-transformation: -0.5371894528156189
In [33]:
sns.distplot(df[df.school_year=='School Year 2016-2017'].log_grad_rate, color='b')
plt.title('Log-transformed and reflected\n CPS graduation rates, 2016/17')
Out[33]:
Text(0.5, 1.0, 'Log-transformed and reflected\n CPS graduation rates, 2016/17')
In [34]:
sns.distplot(df[df.school_year=='School Year 2017-2018'].log_grad_rate, color='orange')
plt.title('Log-transformed and reflected\n CPS graduation rates, 2017/18')
Out[34]:
Text(0.5, 1.0, 'Log-transformed and reflected\n CPS graduation rates, 2017/18')
In [35]:
hist_plt_lg = [df[df.school_year=='School Year 2016-2017'].log_grad_rate, 
              df[df.school_year=='School Year 2017-2018'].log_grad_rate]
color = ['blue', 'orange']
year  = ['2016/17', '2017/18']
for a,c, y in zip(hist_plt_lg, color, year):
    sns.distplot(a, color=c, label=y)
plt.legend()
plt.title("Log Transformed CPS\n Grad Rates 16/17 and 17/18")
Out[35]:
Text(0.5, 1.0, 'Log Transformed CPS\n Grad Rates 16/17 and 17/18')
In [ ]:
Now we will take a look at the features.

Feature Exploration

In [50]:
demographics = ['student_count_total','student_count_asian',
                'student_count_asian_pacific_islander',
                'student_count_black','student_count_english_learners',
                'student_count_ethnicity_not_available',
                'student_count_hawaiian_pacific_islander', 
                'student_count_hispanic','student_count_low_income', 
                 'student_count_multi','student_count_native_american', 
                 'student_count_other_ethnicity',
                'student_count_special_ed','student_count_white']

for dem in demographics:
    hs_records_1718 = ('''SELECT SUM(%s) FROM allschools_1718
                             WHERE is_high_school = 'Y'
                       ''' %dem)
    cursor = conn.cursor()
    cursor.execute(hs_records_1718)
    print('''There are %s %s students in high schools in the 2017/18 school year'''
              %(cursor.fetchall()[0][0], dem))

# hs_gr_null_1718   = ('''SELECT COUNT(school_id) FROM allschools_1718
#                              WHERE is_high_school = 'Y'
#                              AND graduation_rate_school IS NULL
#                      ''')
There are 112394 student_count_total students in high schools in the 2017/18 school year
There are 4563 student_count_asian students in high schools in the 2017/18 school year
There are 20 student_count_asian_pacific_islander students in high schools in the 2017/18 school year
There are 42768 student_count_black students in high schools in the 2017/18 school year
There are 10893 student_count_english_learners students in high schools in the 2017/18 school year
There are 511 student_count_ethnicity_not_available students in high schools in the 2017/18 school year
There are 176 student_count_hawaiian_pacific_islander students in high schools in the 2017/18 school year
There are 52891 student_count_hispanic students in high schools in the 2017/18 school year
There are 73990 student_count_low_income students in high schools in the 2017/18 school year
There are 1206 student_count_multi students in high schools in the 2017/18 school year
There are 354 student_count_native_american students in high schools in the 2017/18 school year
There are 0 student_count_other_ethnicity students in high schools in the 2017/18 school year
There are 17677 student_count_special_ed students in high schools in the 2017/18 school year
There are 9905 student_count_white students in high schools in the 2017/18 school year
In [37]:
#pairplot is huge, so it is saved as an svg in figures/eda.
#sns.pairplot(df)
#plt.savefig('figures/EDA/pairplot_all.svg')
df_dem_subset = df[['log_grad_rate','student_count_total','student_count_asian_perc',
                'student_count_asian_pacific_islander_perc',
                'student_count_black_perc','student_count_english_learners_perc',
                'student_count_ethnicity_not_available_perc',
                'student_count_hawaiian_pacific_islander_perc', 
                'student_count_hispanic_perc','student_count_low_income_perc', 
                 'student_count_multi_perc','student_count_native_american_perc', 
                 'student_count_other_ethnicity_perc',
                'student_count_special_ed_perc','student_count_white_perc']]
sns.pairplot(df_dem_subset, height=1.2, aspect=1.5);
In [12]:
scipy.stats.skew(df['log_grad_rate'])
Out[12]:
-0.5291607698422975
In [13]:
sns.boxplot(df['log_grad_rate'])
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x110af2630>
In [14]:
sns.kdeplot(df['log_grad_rate'])
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x110b629b0>
In [16]:
#Try removing outliers outside of 3 STD of the mean.
df['log_grad_rate'].mean()
Out[16]:
3.019720043155108
In [17]:
df_remove_outliers = df[(np.abs(df.log_grad_rate-df.log_grad_rate.mean())
                        <= 3*df.log_grad_rate.std())]
In [18]:
df_remove_outliers.head()
Out[18]:
school_id school_year student_count_total fully_accessible generally_accessible student_count_asian_perc student_count_asian_pacific_islander_perc student_count_black_perc student_count_english_learners_perc student_count_ethnicity_not_available_perc student_count_hawaiian_pacific_islander_perc student_count_hispanic_perc student_count_low_income_perc student_count_multi_perc student_count_native_american_perc student_count_other_ethnicity_perc student_count_special_ed_perc student_count_white_perc classroom_langauges_count blue brown green orange pink purple red dress_code_dummie grades_offered_count charter gender_f_bn log_grad_rate zip zip_mean_income
0 400064 School Year 2016-2017 388 1 0 0.010 0.0 0.869 0.021 0.000 0.005 0.101 0.863 0.010 0.000 0.0 0.193 0.005 1 0 0 1 0 0 0 1 1 7 1 1 2.660 60616.0 64486.0
1 400105 School Year 2016-2017 322 0 0 0.000 0.0 0.975 0.000 0.000 0.000 0.012 0.814 0.009 0.003 0.0 0.193 0.000 2 0 0 1 0 0 0 1 1 4 1 0 2.868 60616.0 64486.0
2 609676 School Year 2016-2017 455 0 1 0.000 0.0 0.965 0.020 0.000 0.000 0.033 0.965 0.000 0.002 0.0 0.273 0.000 2 0 0 0 0 0 0 0 0 4 0 0 3.350 60616.0 64486.0
3 400087 School Year 2016-2017 198 0 1 0.000 0.0 0.924 0.000 0.000 0.000 0.045 0.924 0.010 0.000 0.0 0.313 0.020 1 0 0 0 0 0 0 1 1 4 1 1 3.506 60616.0 64486.0
4 609764 School Year 2016-2017 1739 0 1 0.003 0.0 0.033 0.190 0.006 0.000 0.945 0.954 0.002 0.003 0.0 0.162 0.007 2 1 0 0 0 1 0 1 0 4 0 0 2.950 60608.0 53353.0
In [19]:
sns.distplot(df_remove_outliers['log_grad_rate'])
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a282785f8>
In [20]:
df_remove_outliers_2std = df[(np.abs(df.log_grad_rate-df.log_grad_rate.mean())
                        <= 2*df.log_grad_rate.std())]
In [21]:
sns.distplot(df_remove_outliers_2std['log_grad_rate'])
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2830fa90>

Correlation

In [22]:
df.corr()
Out[22]:
school_id student_count_total fully_accessible generally_accessible student_count_asian_perc student_count_asian_pacific_islander_perc student_count_black_perc student_count_english_learners_perc student_count_ethnicity_not_available_perc student_count_hawaiian_pacific_islander_perc student_count_hispanic_perc student_count_low_income_perc student_count_multi_perc student_count_native_american_perc student_count_other_ethnicity_perc student_count_special_ed_perc student_count_white_perc classroom_langauges_count blue brown green orange pink purple red dress_code_dummie grades_offered_count charter gender_f_bn log_grad_rate zip zip_mean_income
school_id 1.000 0.156 0.088 0.006 1.441e-01 0.019 -0.176 0.144 -0.050 1.513e-01 1.103e-01 -0.081 0.106 0.175 NaN 0.057 0.218 0.223 -0.180 0.040 -0.106 0.008 -7.724e-02 -0.018 -0.028 -0.385 -0.237 -0.950 -1.678e-01 0.057 0.121 -0.022
student_count_total 0.156 1.000 -0.021 0.125 4.514e-01 0.191 -0.440 -0.013 0.203 2.756e-01 2.550e-01 -0.396 0.386 0.225 NaN -0.396 0.525 0.721 0.244 0.354 -0.246 0.279 5.828e-02 0.167 -0.034 -0.331 0.137 -0.136 -1.822e-01 -0.451 0.013 0.370
fully_accessible 0.088 -0.021 1.000 -0.658 1.851e-01 0.169 -0.223 0.129 0.103 8.277e-02 1.864e-01 -0.051 0.100 0.079 NaN -0.060 0.101 0.104 -0.030 0.140 -0.013 0.123 -5.326e-02 0.019 -0.062 -0.081 0.001 -0.102 -1.339e-01 -0.154 0.069 0.037
generally_accessible 0.006 0.125 -0.658 1.000 -4.666e-02 -0.087 0.153 -0.247 -0.021 5.538e-02 -1.782e-01 -0.038 0.025 -0.120 NaN -0.053 0.046 0.093 0.096 0.008 0.015 -0.161 -1.604e-02 0.045 0.179 -0.038 -0.088 0.005 5.469e-02 -0.028 0.006 0.064
student_count_asian_perc 0.144 0.451 0.185 -0.047 1.000e+00 0.408 -0.405 0.158 0.322 5.314e-01 8.472e-02 -0.436 0.650 0.239 NaN -0.190 0.635 0.543 0.051 0.462 -0.201 -0.020 -9.115e-04 0.116 0.160 -0.277 0.020 -0.136 -7.159e-02 -0.362 0.083 0.399
student_count_asian_pacific_islander_perc 0.019 0.191 0.169 -0.087 4.076e-01 1.000 -0.158 -0.051 0.741 1.365e-01 -6.833e-03 -0.265 0.360 0.073 NaN -0.144 0.294 0.347 -0.003 0.048 -0.072 -0.024 -3.917e-03 -0.027 -0.083 -0.119 0.108 -0.011 1.077e-02 -0.298 -0.064 0.153
student_count_black_perc -0.176 -0.440 -0.223 0.153 -4.049e-01 -0.158 1.000 -0.624 -0.186 -2.427e-01 -9.187e-01 0.202 -0.280 -0.436 NaN 0.138 -0.448 -0.424 -0.252 -0.250 0.374 -0.314 -2.070e-01 -0.134 0.181 0.237 0.133 0.165 5.780e-02 0.375 -0.048 -0.391
student_count_english_learners_perc 0.144 -0.013 0.129 -0.247 1.582e-01 -0.051 -0.624 1.000 -0.064 6.580e-02 6.827e-01 0.126 -0.105 0.196 NaN 0.353 0.004 0.015 0.018 -0.020 -0.225 0.116 1.274e-01 -0.040 -0.191 0.010 -0.096 -0.112 5.859e-02 0.154 0.119 -0.046
student_count_ethnicity_not_available_perc -0.050 0.203 0.103 -0.021 3.216e-01 0.741 -0.186 -0.064 1.000 1.318e-01 1.514e-02 -0.334 0.397 0.021 NaN -0.185 0.345 0.324 0.102 0.023 -0.096 -0.023 4.156e-02 -0.022 -0.014 -0.181 0.054 -0.024 -1.843e-02 -0.293 -0.088 0.242
student_count_hawaiian_pacific_islander_perc 0.151 0.276 0.083 0.055 5.314e-01 0.136 -0.243 0.066 0.132 1.000e+00 3.918e-02 -0.347 0.451 0.185 NaN -0.176 0.443 0.333 0.083 0.405 -0.143 -0.068 -5.506e-04 0.013 0.104 -0.166 0.078 -0.153 -7.318e-02 -0.248 -0.061 0.353
student_count_hispanic_perc 0.110 0.255 0.186 -0.178 8.472e-02 -0.007 -0.919 0.683 0.015 3.918e-02 1.000e+00 0.063 -0.060 0.356 NaN -0.106 0.080 0.188 0.210 0.083 -0.320 0.373 2.287e-01 0.088 -0.244 -0.076 -0.173 -0.089 -1.567e-04 -0.243 0.042 0.174
student_count_low_income_perc -0.081 -0.396 -0.051 -0.038 -4.364e-01 -0.265 0.202 0.126 -0.334 -3.470e-01 6.296e-02 1.000 -0.625 -0.158 NaN 0.180 -0.664 -0.479 -0.101 -0.215 0.148 0.009 1.983e-02 -0.125 -0.060 0.387 -0.160 0.144 2.086e-01 0.357 -0.018 -0.528
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
green -0.106 -0.246 -0.013 0.015 -2.006e-01 -0.072 0.374 -0.225 -0.096 -1.428e-01 -3.196e-01 0.148 -0.116 -0.146 NaN 0.031 -0.220 -0.157 -0.040 -0.023 1.000 -0.056 -4.050e-02 0.148 0.128 0.178 0.033 0.137 -1.600e-02 0.009 -0.065 -0.182
orange 0.008 0.279 0.123 -0.161 -2.039e-02 -0.024 -0.314 0.116 -0.023 -6.785e-02 3.731e-01 0.009 -0.115 0.164 NaN -0.154 -0.042 0.040 0.032 0.076 -0.056 1.000 2.186e-01 0.319 -0.046 -0.129 -0.084 0.007 3.518e-02 -0.135 -0.059 0.050
pink -0.077 0.058 -0.053 -0.016 -9.115e-04 -0.004 -0.207 0.127 0.042 -5.506e-04 2.287e-01 0.020 -0.013 -0.039 NaN -0.082 0.017 -0.020 0.247 0.187 -0.040 0.219 1.000e+00 0.338 0.100 -0.032 -0.033 0.025 6.607e-02 -0.085 -0.357 0.128
purple -0.018 0.167 0.019 0.045 1.162e-01 -0.027 -0.134 -0.040 -0.022 1.304e-02 8.783e-02 -0.125 0.189 0.096 NaN -0.086 0.139 0.118 0.234 0.460 0.148 0.319 3.383e-01 1.000 0.221 -0.106 -0.063 0.025 -4.977e-02 -0.166 -0.200 0.405
red -0.028 -0.034 -0.062 0.179 1.598e-01 -0.083 0.181 -0.191 -0.014 1.042e-01 -2.435e-01 -0.060 0.151 -0.169 NaN -0.134 0.038 0.007 -0.080 0.088 0.128 -0.046 1.001e-01 0.221 1.000 0.055 -0.161 -0.017 -9.801e-02 -0.111 -0.072 0.106
dress_code_dummie -0.385 -0.331 -0.081 -0.038 -2.767e-01 -0.119 0.237 0.010 -0.181 -1.660e-01 -7.609e-02 0.387 -0.369 -0.264 NaN 0.043 -0.447 -0.364 -0.057 -0.109 0.178 -0.129 -3.153e-02 -0.106 0.055 1.000 -0.115 0.399 1.316e-01 0.087 -0.073 -0.374
grades_offered_count -0.237 0.137 0.001 -0.088 1.999e-02 0.108 0.133 -0.096 0.054 7.846e-02 -1.728e-01 -0.160 0.155 -0.102 NaN -0.126 0.047 0.150 0.008 -0.061 0.033 -0.084 -3.301e-02 -0.063 -0.161 -0.115 1.000 0.261 -2.981e-02 0.017 -0.064 0.138
charter -0.950 -0.136 -0.102 0.005 -1.357e-01 -0.011 0.165 -0.112 -0.024 -1.527e-01 -8.885e-02 0.144 -0.134 -0.152 NaN -0.048 -0.240 -0.206 0.135 -0.026 0.137 0.007 2.490e-02 0.025 -0.017 0.399 0.261 1.000 1.618e-01 -0.076 -0.100 -0.015
gender_f_bn -0.168 -0.182 -0.134 0.055 -7.159e-02 0.011 0.058 0.059 -0.018 -7.318e-02 -1.567e-04 0.209 -0.170 -0.153 NaN 0.135 -0.162 -0.202 -0.066 -0.073 -0.016 0.035 6.607e-02 -0.050 -0.098 0.132 -0.030 0.162 1.000e+00 0.106 0.002 -0.083
log_grad_rate 0.057 -0.451 -0.154 -0.028 -3.625e-01 -0.298 0.375 0.154 -0.293 -2.475e-01 -2.433e-01 0.357 -0.414 -0.033 NaN 0.576 -0.350 -0.488 -0.155 -0.283 0.009 -0.135 -8.530e-02 -0.166 -0.111 0.087 0.017 -0.076 1.062e-01 1.000 0.070 -0.361
zip 0.121 0.013 0.069 0.006 8.327e-02 -0.064 -0.048 0.119 -0.088 -6.106e-02 4.189e-02 -0.018 -0.100 0.133 NaN 0.089 0.017 -0.076 -0.181 -0.086 -0.065 -0.059 -3.571e-01 -0.200 -0.072 -0.073 -0.064 -0.100 1.871e-03 0.070 1.000 -0.064
zip_mean_income -0.022 0.370 0.037 0.064 3.986e-01 0.153 -0.391 -0.046 0.242 3.528e-01 1.742e-01 -0.528 0.607 0.244 NaN -0.187 0.609 0.448 0.434 0.407 -0.182 0.050 1.276e-01 0.405 0.106 -0.374 0.138 -0.015 -8.284e-02 -0.361 -0.064 1.000

32 rows × 32 columns

In [23]:
# example of a better corr matrix
sns.heatmap(df.corr(), cmap="seismic", annot=False, vmin=-1, vmax=1);
In [ ]: